In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

# CROSS-VALIDATION

# Import necessary libraries
! pip install pandas numpy statsmodels scikit-learn ISLP

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from ISLP import load_data
In [17]:
# Load the Auto dataset from package ISLP
Auto = load_data('Auto')  
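A quick shape check (a sketch, just to confirm the sample size used in the splits below; the ISLP Auto data has one row per car):
In [ ]:
# Inspect the data: number of observations and the variables used below
print(Auto.shape)
print(Auto[['mpg', 'weight', 'horsepower', 'acceleration']].head())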
In [19]:
# 1. Validation Set Approach

# Select predictor and response variables
X = Auto[['weight', 'horsepower', 'acceleration']].values
y = Auto['mpg'].values

# Set a random seed and draw a random training set of 250 observations
np.random.seed(42)
n = len(y)
Z = np.random.choice(n, 250, replace=False)  # indices of the training observations

# Fit linear regression using training data
reg = LinearRegression()
reg.fit(X[Z], y[Z])

# Predict on the testing data (those not in Z)
test_indices = np.setdiff1d(np.arange(n), Z)  # Get indices that are not in Z
mpg_predicted = reg.predict(X[test_indices])

# Calculate MSE
mse_validation = mean_squared_error(y[test_indices], mpg_predicted)
print(f"Validation MSE: {mse_validation}")
Validation MSE: 14.044255907353062
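The same approach can be written with scikit-learn's train_test_split; a minimal sketch (the train_size of 250 and random_state mirror the cell above, and the exact MSE will differ because the split is drawn differently):
In [ ]:
# Validation set approach via train_test_split (sketch)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=250, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
print(f"Validation MSE: {mean_squared_error(y_test, reg.predict(X_test))}")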
In [25]:
# 2. Jackknife (Leave-One-Out Cross-Validation, LOOCV)

# Manual LOOCV

Yhat = np.zeros(n)

# Refit the model n times, each time leaving out one observation
for i in range(n):
    X_train = np.delete(X, i, axis=0)   # drop observation i from the predictors
    y_train = np.delete(y, i, axis=0)   # drop observation i from the response
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    Yhat[i] = reg.predict(X[i].reshape(1, -1))[0]  # predict the held-out observation

# Calculate MSE for LOOCV

mse_loocv_manual = mean_squared_error(y, Yhat)
print(f"LOOCV MSE (manual): {mse_loocv_manual}")

# Using cross_val_score from sklearn for LOOCV
loo = LeaveOneOut()
reg = LinearRegression()
mse_loocv_sklearn = -np.mean(cross_val_score(reg, X, y, cv=loo, scoring='neg_mean_squared_error'))
print(f"LOOCV MSE (sklearn): {mse_loocv_sklearn}")
LOOCV MSE (manual): 18.25595080469144
LOOCV MSE (sklearn): 18.255950804691444
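For least squares there is also a closed-form LOOCV shortcut that avoids the n refits: with leverages h_ii taken from the hat matrix H = X(X'X)^{-1}X' (intercept included), the LOOCV MSE equals the mean of [(y_i - yhat_i) / (1 - h_ii)]^2. A sketch using the arrays defined above:
In [ ]:
# LOOCV for linear regression via the hat-matrix identity (sketch)
X1 = np.column_stack([np.ones(n), X])       # design matrix with intercept column
H = X1 @ np.linalg.solve(X1.T @ X1, X1.T)   # hat matrix H = X (X'X)^{-1} X'
h = np.diag(H)                              # leverages h_ii
resid = y - H @ y                           # ordinary least-squares residuals
mse_loocv_hat = np.mean((resid / (1 - h)) ** 2)
print(f"LOOCV MSE (hat-matrix shortcut): {mse_loocv_hat}")  # should match the looped value up to floating-point error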
In [27]:
# 3. K-Fold Cross-Validation (K = 30 folds)
kf = KFold(n_splits=30, shuffle=True, random_state=42)
cv_error = []

# Polynomial regression of increasing degree, scored by K-fold CV
for p in range(1, 11):
    poly = PolynomialFeatures(degree=p)  # degree-p terms and interactions in all three predictors
    X_poly = poly.fit_transform(Auto[['weight', 'horsepower', 'acceleration']])
    reg = LinearRegression()
    mse_kfold = -np.mean(cross_val_score(reg, X_poly, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_error.append(mse_kfold)

# Display errors for each degree
print(cv_error)

# Find the degree with the minimum error
min_error_degree = np.argmin(cv_error) + 1  # +1 because Python indices start at 0
print(f"Degree with minimum error: {min_error_degree}")
[18.27445365338935, 15.968997263479562, 19.66681010976254, 20.537052894764713, 37.131761204331504, 29.717233687052477, 2602.9910843513617, 2402.2450938767124, 323.7778722471489, 335.79439151783146]
Degree with minimum error: 2
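The same comparison can be written with a scikit-learn Pipeline; standardizing the polynomial features may also mitigate the ill-conditioning that inflates the errors at high degrees. A sketch (this pipeline layout is one reasonable choice, not the only one):
In [ ]:
# K-fold comparison of polynomial degrees using a Pipeline (sketch)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_raw = Auto[['weight', 'horsepower', 'acceleration']].values
cv_error_pipe = []
for p in range(1, 11):
    pipe = make_pipeline(PolynomialFeatures(degree=p), StandardScaler(), LinearRegression())
    mse = -np.mean(cross_val_score(pipe, X_raw, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_error_pipe.append(mse)

print(cv_error_pipe)
print(f"Degree with minimum error: {np.argmin(cv_error_pipe) + 1}")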